Data Preparation Steps

# Load the ridership data. Use stringsAsFactors = FALSE (not the
# reassignable shorthand F) so Month/Season stay character columns.
rider <- read.csv("AmtrakBig_CA_Question-3.csv", stringsAsFactors = FALSE)

glimpse(rider)
## Observations: 159
## Variables: 4
## $ Month     <chr> "Jan-05", "Feb-05", "Mar-05", "Apr-05", "May-05", "Jun…
## $ Ridership <int> 1709, 1621, 1973, 1812, 1975, 1862, 1940, 2013, 1596, …
## $ t         <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,…
## $ Season    <chr> "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug"…

# Parse "Jan-05"-style labels into proper dates by prefixing a day of month.
# as.Date() is preferred over strptime() here: strptime() returns POSIXlt,
# a list-based class that behaves poorly as a data frame column, while
# the "%Y" extraction below works identically on Date values.
# NOTE(review): "%b" month-name parsing is locale-dependent — this assumes
# an English locale.
rider$Date <- as.Date(paste("01", rider$Month), format = "%d %b-%y")
rider$Year <- format(rider$Date, "%Y")

# Cross-tabulate Year x Season to confirm every month appears exactly once.
table(rider$Year, rider$Season)
##       
##        Apr Aug Dec Feb Jan Jul Jun Mar May Nov Oct Sep
##   2005   1   1   1   1   1   1   1   1   1   1   1   1
##   2006   1   1   1   1   1   1   1   1   1   1   1   1
##   2007   1   1   1   1   1   1   1   1   1   1   1   1
##   2008   1   1   1   1   1   1   1   1   1   1   1   1
##   2009   1   1   1   1   1   1   1   1   1   1   1   1
##   2010   1   1   1   1   1   1   1   1   1   1   1   1
##   2011   1   1   1   1   1   1   1   1   1   1   1   1
##   2012   1   1   1   1   1   1   1   1   1   1   1   1
##   2013   1   1   1   1   1   1   1   1   1   1   1   1
##   2014   1   1   1   1   1   1   1   1   1   1   1   1
##   2015   1   1   1   1   1   1   1   1   1   1   1   1
##   2016   1   1   1   1   1   1   1   1   1   1   1   1
##   2017   1   1   1   1   1   1   1   1   1   1   1   1
##   2018   0   0   0   1   1   0   0   1   0   0   0   0
# There is no missing gap in the dataset (2018 only has Jan-Mar so far)

# Build a monthly time series: frequency 12, starting January 2005.
# 159 observations = Jan 2005 through Mar 2018.
rider_ts = ts(rider$Ridership, frequency = 12, start = c(2005, 1))
is.ts(rider_ts)
## [1] TRUE
rider_ts
##       Jan  Feb  Mar  Apr  May  Jun  Jul  Aug  Sep  Oct  Nov  Dec
## 2005 1709 1621 1973 1812 1975 1862 1940 2013 1596 1725 1676 1814
## 2006 1615 1557 1891 1956 1885 1623 1903 1997 1704 1810 1862 1875
## 2007 1705 1619 1837 1957 1917 1882 1933 1996 1673 1753 1720 1734
## 2008 1563 1574 1903 1834 1831 1776 1868 1907 1686 1779 1776 1783
## 2009 1548 1497 1798 1733 1772 1761 1792 1875 1571 1647 1673 1657
## 2010 1382 1361 1559 1608 1697 1693 1836 1943 1551 1687 1576 1700
## 2011 1397 1372 1708 1655 1763 1776 1934 2008 1616 1774 1732 1797
## 2012 1570 1413 1755 1825 1843 1826 1968 1922 1670 1791 1817 1847
## 2013 1599 1549 1832 1840 1846 1865 1966 1949 1607 1804 1850 1836
## 2014 1542 1617 1920 1971 1992 2010 2054 2097 1824 1977 1981 2000
## 2015 1683 1663 2008 2024 2047 2073 2127 2203 1708 1951 1974 1985
## 2016 1760 1771 2020 2048 2069 1994 2075 2027 1734 1917 1858 1996
## 2017 1778 1749 2066 2099 2105 2130 2223 2174 1931 2121 2076 2141
## 2018 1832 1838 2132
# acf(rider_ts) ; pacf(rider_ts)
#ggplot(NULL, aes(y = rider_ts, x = seq_along(rider_ts))) + geom_line()+geom_point() 


# Split TS data for Train and Test (year 2016, 2017 & 2018 Jan-Mar)
# 159 obs total: train = first 132 (Jan 2005 - Dec 2015),
# test = last 27 (Jan 2016 - Mar 2018). subset() here is the ts-aware
# method (presumably forecast::subset.ts — confirm library() calls),
# which preserves the ts attributes of each half.
train = subset(rider_ts, end=length(rider_ts)-27)
test = subset(rider_ts, start=length(rider_ts)-26)

# Visual sanity check of the full series and both halves of the split.
# cycle() prints the month-of-year index (1..12) per observation, which
# confirms the exact boundaries: full series Jan 2005 - Mar 2018,
# train Jan 2005 - Dec 2015, test Jan 2016 - Mar 2018.
autoplot(rider_ts) ; cycle(rider_ts)

##      Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
## 2005   1   2   3   4   5   6   7   8   9  10  11  12
## 2006   1   2   3   4   5   6   7   8   9  10  11  12
## 2007   1   2   3   4   5   6   7   8   9  10  11  12
## 2008   1   2   3   4   5   6   7   8   9  10  11  12
## 2009   1   2   3   4   5   6   7   8   9  10  11  12
## 2010   1   2   3   4   5   6   7   8   9  10  11  12
## 2011   1   2   3   4   5   6   7   8   9  10  11  12
## 2012   1   2   3   4   5   6   7   8   9  10  11  12
## 2013   1   2   3   4   5   6   7   8   9  10  11  12
## 2014   1   2   3   4   5   6   7   8   9  10  11  12
## 2015   1   2   3   4   5   6   7   8   9  10  11  12
## 2016   1   2   3   4   5   6   7   8   9  10  11  12
## 2017   1   2   3   4   5   6   7   8   9  10  11  12
## 2018   1   2   3
autoplot(train) ; cycle(train)

##      Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
## 2005   1   2   3   4   5   6   7   8   9  10  11  12
## 2006   1   2   3   4   5   6   7   8   9  10  11  12
## 2007   1   2   3   4   5   6   7   8   9  10  11  12
## 2008   1   2   3   4   5   6   7   8   9  10  11  12
## 2009   1   2   3   4   5   6   7   8   9  10  11  12
## 2010   1   2   3   4   5   6   7   8   9  10  11  12
## 2011   1   2   3   4   5   6   7   8   9  10  11  12
## 2012   1   2   3   4   5   6   7   8   9  10  11  12
## 2013   1   2   3   4   5   6   7   8   9  10  11  12
## 2014   1   2   3   4   5   6   7   8   9  10  11  12
## 2015   1   2   3   4   5   6   7   8   9  10  11  12
autoplot(test) ; cycle(test)

##      Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
## 2016   1   2   3   4   5   6   7   8   9  10  11  12
## 2017   1   2   3   4   5   6   7   8   9  10  11  12
## 2018   1   2   3

Methods Used

# Lag plots: ridership against its own lagged values, one panel per lag;
# a strong diagonal at lag 12 indicates annual seasonality.
gglagplot(rider_ts)

# There is evidence of seasonality at lag 12.
# Try seasonal differencing at lag 12

# Visualise the various components of TS (trend / seasonal / remainder)
decompose(rider_ts) %>% plot()

# or plot(stl(rider_ts, s.window="periodic"))
# There is a downward trend from year 2006 to 2010 which then turned positive
# Try differencing by the order of d=1. Also try D=1 for seasonal effect
# Try AR(1)
# Try MA(1)


# Train dataset
# Making Trend Stationary with Differencing (first difference, d = 1).
# Use TRUE rather than the reassignable shorthand T.
train %>%
  diff() %>%
  ggtsdisplay(smooth = TRUE)

train %>%
  diff() %>%
  diff(lag = 12) %>% # additional seasonal differencing at lag 12 (D = 1)
  ggtsdisplay(smooth = TRUE)

# ACF improved with additional seasonal differencing at lag 12.
# Have d=1 and D=1 for ARIMA

# Keep the doubly-differenced series (d = 1, D = 1); it is reused by the
# ADF test and the ARIMA experiments on the stationary series below.
train010_010 <- train %>% diff() %>% diff(lag = 12)

plot(train010_010)

# Augmented Dickey-Fuller Test for stationarity
# (adfTest is presumably fUnitRoots::adfTest — confirm library() calls above;
#  H0: the series has a unit root, i.e. is non-stationary)
# Prior to Differencing: p = 0.56, cannot reject H0 — train is non-stationary.
adfTest(train)
## 
## Title:
##  Augmented Dickey-Fuller Test
## 
## Test Results:
##   PARAMETER:
##     Lag Order: 1
##   STATISTIC:
##     Dickey-Fuller: -0.1768
##   P VALUE:
##     0.5575 
## 
## Description:
##  Tue Sep  3 00:22:05 2019 by user:
# After Differencing (d=1, D=1): p < 0.01, the differenced series is stationary.
adfTest(train010_010)
## Warning in adfTest(train010_010): p-value smaller than printed p-value
## 
## Title:
##  Augmented Dickey-Fuller Test
## 
## Test Results:
##   PARAMETER:
##     Lag Order: 1
##   STATISTIC:
##     Dickey-Fuller: -10.4791
##   P VALUE:
##     0.01 
## 
## Description:
##  Tue Sep  3 00:22:05 2019 by user:
# p-value < 0.05,
# we reject the NULL hypothesis that it is a non-stationary time series


# AR: Using arima p=1 (non-seasonal) on the already-differenced series,
# so d = 0 here; the differencing was applied manually above.
model_100 = Arima(train010_010, order=c(1,0,0))
checkresiduals(model_100)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,0) with non-zero mean
## Q* = 37.99, df = 22, p-value = 0.01837
## 
## Model df: 2.   Total lags used: 24
tsdisplay(residuals(model_100)) # showing the PACF

# The PACF spike in lag 1 is reduced, but the spike at lag 12 still exists.

# ggfortify::ggtsdiag(model_100)
# Ljung-Box Statistic still showing the residuals p-value <0.05, 
# rejecting the NULL Hypothesis,
# implying residual data are not independently distributed; they exhibit serial correlation.
# Note: ggfortify cause conflict with autoplot()

# Next add MA: Using arima p=1, q=1 (non-seasonal) on the differenced series.
model_101 = Arima(train010_010, order=c(1,0,1))
checkresiduals(model_101)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,1) with non-zero mean
## Q* = 28.259, df = 21, p-value = 0.1329
## 
## Model df: 3.   Total lags used: 24
tsdisplay(residuals(model_101)) # showing the PACF

# The ACF reduced. PACF also reduced, but the spike at lag 12 still exists,
# Try adding a seasonal component at lag 12 (Q=1)

# ggfortify::ggtsdiag(model_101)
# Ljung-Box Statistic now showed the residuals p-value > 0.05, 
# do not reject the NULL Hypothesis,
# implying residual data are independently distributed. 
# Note: ggfortify cause conflict with autoplot()


# Add Seasonal Components: Using arima Q=1 (seasonal MA at lag 12)
model_101_001 = Arima(train010_010, order=c(1,0,1), seasonal = c(0,0,1))
checkresiduals(model_101_001)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,1)(0,0,1)[12] with non-zero mean
## Q* = 11.44, df = 20, p-value = 0.934
## 
## Model df: 4.   Total lags used: 24
tsdisplay(residuals(model_101_001)) # showing the PACF

# The residual spike at lag 12 dropped below the significance threshold.

# Add Seasonal Components: Using arima Q=2 (second seasonal MA term)
model_101_002 = Arima(train010_010, order=c(1,0,1), seasonal = c(0,0,2))
checkresiduals(model_101_002)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,1)(0,0,2)[12] with non-zero mean
## Q* = 10.414, df = 19, p-value = 0.942
## 
## Model df: 5.   Total lags used: 24
tsdisplay(residuals(model_101_002)) # showing the PACF

# There is not much change to the ACF and PACF; try P=1 instead.


# Add Seasonal Components: Using arima P=1, Q=1 (seasonal AR + MA)
model_101_101 = Arima(train010_010, order=c(1,0,1), seasonal = c(1,0,1))
checkresiduals(model_101_101)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,1)(1,0,1)[12] with non-zero mean
## Q* = 10.447, df = 19, p-value = 0.941
## 
## Model df: 5.   Total lags used: 24
tsdisplay(residuals(model_101_101)) # showing the PACF

# There is not much change to the ACF and PACF, so keep the simpler Q=1 form.

# Therefore, the ARIMA parameters for the non-stationary train set will be
# arima (p=1, d=1, q=1) * (P=0, D=1, Q=1)
# Here the differencing (d=1, D=1) is done inside Arima() on the raw train
# series, so the manually differenced train010_010 is not needed any more.
model_111_011 = Arima(train, order=c(1,1,1), seasonal = c(0,1,1))
checkresiduals(model_111_011)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,1,1)(0,1,1)[12]
## Q* = 12.319, df = 21, p-value = 0.9306
## 
## Model df: 3.   Total lags used: 24
tsdisplay(residuals(model_111_011)) # showing the PACF

summary(model_111_011)
## Series: train 
## ARIMA(1,1,1)(0,1,1)[12] 
## 
## Coefficients:
##          ar1      ma1     sma1
##       0.2904  -0.7377  -0.6973
## s.e.  0.1623   0.1189   0.0931
## 
## sigma^2 estimated as 4009:  log likelihood=-665.16
## AIC=1338.32   AICc=1338.67   BIC=1349.44
## 
## Training set error measures:
##                    ME     RMSE      MAE       MPE     MAPE      MASE
## Training set 5.546197 59.35247 44.91013 0.2261564 2.549106 0.5574281
##                     ACF1
## Training set -0.02994887
# z-tests on the coefficients: ma1 and sma1 are significant; ar1 is only
# marginal (p = 0.074).
coeftest(model_111_011)
## 
## z test of coefficients:
## 
##       Estimate Std. Error z value  Pr(>|z|)    
## ar1   0.290382   0.162321  1.7889   0.07363 .  
## ma1  -0.737721   0.118856 -6.2069 5.405e-10 ***
## sma1 -0.697325   0.093086 -7.4912 6.827e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Actual series (red) overlaid with in-sample fitted values (blue)
plot(model_111_011$x, col="red") ; lines(fitted(model_111_011), col="blue")

# Test against Forecast for 27 months (the length of the test window)
model_111_011 %>%
  forecast(h=27) %>%
  autoplot()+autolayer(test)

# Try to reduce the range of confidence interval for the forecast 
# by increasing the AR by an additional seasonal order
# Therefore, the ARIMA parameters for the non-stationary train set will be
# arima (p=1, d=1, q=1) * (P=1, D=1, Q=1)
model_111_111 = Arima(train, order=c(1,1,1), seasonal = c(1,1,1))
checkresiduals(model_111_111)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,1,1)(1,1,1)[12]
## Q* = 11.49, df = 20, p-value = 0.9325
## 
## Model df: 4.   Total lags used: 24
tsdisplay(residuals(model_111_111)) # showing the PACF

summary(model_111_111)
## Series: train 
## ARIMA(1,1,1)(1,1,1)[12] 
## 
## Coefficients:
##          ar1      ma1    sar1     sma1
##       0.2953  -0.7395  0.0742  -0.7455
## s.e.  0.1625   0.1191  0.1540   0.1350
## 
## sigma^2 estimated as 4021:  log likelihood=-665.04
## AIC=1340.08   AICc=1340.61   BIC=1353.98
## 
## Training set error measures:
##                    ME     RMSE      MAE       MPE     MAPE      MASE
## Training set 5.668452 59.18607 44.73699 0.2325546 2.538906 0.5552791
##                     ACF1
## Training set -0.03008069
# Note: the extra sar1 term is not significant (p = 0.63) and AIC/BIC are
# worse than model_111_011, so the added seasonal AR term adds little.
coeftest(model_111_111)
## 
## z test of coefficients:
## 
##       Estimate Std. Error z value  Pr(>|z|)    
## ar1   0.295269   0.162540  1.8166   0.06928 .  
## ma1  -0.739541   0.119125 -6.2081 5.363e-10 ***
## sar1  0.074177   0.154017  0.4816   0.63008    
## sma1 -0.745521   0.134959 -5.5241 3.313e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Actual series (red) overlaid with in-sample fitted values (blue)
plot(model_111_111$x, col="red") ; lines(fitted(model_111_111), col="blue")

# Test against Forecast for 27 months
model_111_111 %>% 
  forecast(h=27) %>%
  autoplot() + autolayer(test)

# Try auto.arima: automatic order selection as a cross-check on the
# manually tuned model.
fitautoarima = auto.arima(train)
checkresiduals(fitautoarima)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,1,1)(0,1,1)[12]
## Q* = 12.319, df = 21, p-value = 0.9306
## 
## Model df: 3.   Total lags used: 24
tsdisplay(residuals(fitautoarima)) # showing the PACF

summary(fitautoarima)
## Series: train 
## ARIMA(1,1,1)(0,1,1)[12] 
## 
## Coefficients:
##          ar1      ma1     sma1
##       0.2904  -0.7377  -0.6973
## s.e.  0.1623   0.1189   0.0931
## 
## sigma^2 estimated as 4009:  log likelihood=-665.16
## AIC=1338.32   AICc=1338.67   BIC=1349.44
## 
## Training set error measures:
##                    ME     RMSE      MAE       MPE     MAPE      MASE
## Training set 5.546197 59.35247 44.91013 0.2261564 2.549106 0.5574281
##                     ACF1
## Training set -0.02994887
coeftest(fitautoarima)
## 
## z test of coefficients:
## 
##       Estimate Std. Error z value  Pr(>|z|)    
## ar1   0.290382   0.162321  1.7889   0.07363 .  
## ma1  -0.737721   0.118856 -6.2069 5.405e-10 ***
## sma1 -0.697325   0.093086 -7.4912 6.827e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Actual series (red) overlaid with in-sample fitted values (blue)
plot(fitautoarima$x, col="red") ; lines(fitted(fitautoarima), col="blue")

# Result of auto.arima is ARIMA(1,1,1)(0,1,1)[12]
# Same as the manually tuned model_111_011 (identical coefficients)


# Test against Forecast for 27 months
fitautoarima %>%
  forecast(h=27) %>%
  autoplot() + autolayer(test)

# Try ETS (exponential smoothing state-space model) as an alternative
# model family; ets() selects the error/trend/seasonal form automatically.
model_ETS = ets(train)
checkresiduals(model_ETS)

## 
##  Ljung-Box test
## 
## data:  Residuals from ETS(A,N,A)
## Q* = 20.91, df = 10, p-value = 0.02173
## 
## Model df: 14.   Total lags used: 24
tsdisplay(residuals(model_ETS)) # showing the PACF

# Note: Ljung-Box p < 0.05 here, so the ETS residuals still show serial
# correlation, unlike the ARIMA models above.
summary(model_ETS)
## ETS(A,N,A) 
## 
## Call:
##  ets(y = train) 
## 
##   Smoothing parameters:
##     alpha = 0.5245 
##     gamma = 1e-04 
## 
##   Initial states:
##     l = 1812.8525 
##     s = 28.7322 -7.9188 1.5296 -123.0332 202.4818 140.0204
##            42.4109 78.5713 54.4692 45.4712 -254.9488 -207.7858
## 
##   sigma:  60.6351
## 
##      AIC     AICc      BIC 
## 1743.417 1747.555 1786.659 
## 
## Training set error measures:
##                    ME     RMSE      MAE       MPE     MAPE      MASE
## Training set 2.079917 57.32948 45.03911 0.0288371 2.553083 0.5590291
##                    ACF1
## Training set 0.06755291
# Actual series (red) overlaid with in-sample fitted values (blue)
plot(model_ETS$x, col="red") ; lines(fitted(model_ETS), col="blue")

# Test against Forecast for 27 months
model_ETS %>%
  forecast(h=27) %>%
  autoplot() + autolayer(test)

# It is noted that the forecast did not track the test result closely

Comparison of Accuracy Achieved by Different Techniques

# Comparing the TS models, excluding fitautoarima which is same as model_111_011
# Predicting against Test data
# NOTE(review): predict(<ts>, model = <fit>) refits each trained model (with
# its parameters held fixed) onto the test window; accuracy() below therefore
# reports one-step-ahead refit errors on that window — which is why the rows
# are labelled "Training set" — not true multi-step out-of-sample forecast
# accuracy. For a stricter comparison consider
# accuracy(forecast(model_111_011, h = 27), test).
pred_model_111_011 = predict(test, model = model_111_011)
pred_model_111_111 = predict(test, model = model_111_111)
pred_model_ETS = predict(test, model = model_ETS)
## Model is being refit with current smoothing parameters but initial states are being re-estimated.
## Set 'use.initial.values=TRUE' if you want to re-use existing initial values.
accuracy(pred_model_111_011)
##                    ME     RMSE      MAE       MPE     MAPE      MASE
## Training set 6.190112 34.34011 21.48619 0.2563945 1.067993 0.2043709
##                   ACF1
## Training set 0.4854798
accuracy(pred_model_111_111)
##                    ME    RMSE      MAE       MPE     MAPE      MASE
## Training set 6.256073 34.5735 21.67848 0.2597599 1.077045 0.2061999
##                   ACF1
## Training set 0.4820497
accuracy(pred_model_ETS)
##                    ME     RMSE      MAE       MPE     MAPE      MASE
## Training set 6.421122 30.68692 24.99541 0.2972348 1.253015 0.2377496
##                   ACF1
## Training set 0.4943721
# pred_model_111_011 has the lowest error measures except for RMSE
# which is higher than pred_model_ETS. 

Ex Ante Forecast for 6 months

# Train the chosen ARIMA spec and ETS on the full data set (all 159 months)
# so the ex ante forecast starts after Mar 2018.
modelF_111_011 = Arima(rider_ts, order = c(1,1,1), seasonal = c(0,1,1))
modelF_ETS = ets(rider_ts)

# Ex Ante Forecast for 6 months
fcst_111_011 = forecast(modelF_111_011, h=6)
fcst_ETS = forecast(modelF_ETS, h=6)


# save Ex Ante Forecast of the model in the new data frame 
# together with variable you want to plot against
# select(-2,-3) drops the Lo.80/Hi.80 columns, keeping
# Point.Forecast, Lo.95, Hi.95.
F_111_011 = data.frame(fcst_111_011) %>% select(-2,-3)
# Row names of a forecast data frame are "Apr 2018"-style labels; prefix a
# day so lubridate::dmy() can parse them into Dates.
F_111_011$Time = dmy(paste(1, rownames(F_111_011)))
F_111_011$Model = "ARIMA_111_011"
# Reorder columns to: Time, Model, Point.Forecast, Lo.95, Hi.95
F_111_011 = F_111_011[,c(4,5,1,2,3)]
#F_111_011 = gather(F_111_011, "Type", "Forecast", 3:5)

# Same reshaping for the ETS forecast
F_ETS = data.frame(fcst_ETS) %>% select(-2,-3)
F_ETS$Time = dmy(paste(1, rownames(F_ETS)))
F_ETS$Model = "ETS"
F_ETS = F_ETS[,c(4,5,1,2,3)]
#F_ETS = gather(F_ETS, "Type", "Forecast", 3:5)

# Stack both model forecasts into one long data frame for plotting
df = rbind(F_111_011, F_ETS, deparse.level = 0)

# Extract the recent actuals (2017 onwards) as a data frame for context.
# NOTE(review): naming this object `ts` masks the base constructor ts() for
# the rest of the session — a different name would be safer. Also,
# as.Date() on a ts object presumably relies on zoo's as.Date.ts method —
# confirm the loaded packages.
ts = window(rider_ts, start = 2017)
ds = data.frame(ts)
names(ds) = "Point.Forecast"
ds$Time = as.Date(ts)
ds$Point.Forecast = as.numeric(ds$Point.Forecast)
ds$Model = "Actual"
ds = ds[,c(2,3,1)]

# Combine actuals and forecasts; the actuals have NA Lo.95/Hi.95, so their
# ribbon is simply not drawn.
ds_df = full_join(ds,df)
## Joining, by = c("Time", "Model", "Point.Forecast")
ds_df %>% ggplot(aes(x = Time, y=Point.Forecast, col = Model)) +
  geom_point() +
  geom_line() +
  geom_ribbon(aes(ymin=Lo.95,ymax=Hi.95, fill = Model), linetype = 0, alpha=.2) +
  
  theme_bw() +
  theme(legend.position = "bottom",
        axis.text = element_text(colour = "black")
  ) +
  
  labs(title="Ridership Analysis",
       subtitle= "6 months forecast of Ridership",
       y = "Count of Riders",
       x = "Date")

Findings

# Tabulate the Ridership estimate forecast for 6 months based on average of Point.Forecast of all forecast models
# For the maximum limit, obtain the highest of upper limit of 95% confidence interval of all forecast models
# For the minimum limit, obtain the highest of lower limit of 95% confidence interval of all forecast models
# NOTE(review): Min = max(Lo.95) matches the stated intent (the narrowest,
# most conservative band across models), but min(Lo.95) would give the
# widest band — confirm which is wanted.
x_table = df %>% 
  group_by(Time) %>%
  summarise(Estimates = round(mean(Point.Forecast)), 
            Min = round(max(Lo.95)), 
            Max = round(max(Hi.95)))

# Format the Time column as "2018 Apr"-style labels for display
x_table$Time = format(x_table$Time, "%Y %b")

library(ggpubr)
## Loading required package: magrittr
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
## 
## Attaching package: 'ggpubr'
## The following object is masked from 'package:forecast':
## 
##     gghistogram
# Render the summary as a ggplot table graphic with a transparent theme
table1 <- ggtexttable(x_table, 
                      theme = ttheme(
                        tbody.style = tbody_style(color = "black", face = "plain", size = 12, 
                                                  fill = "transparent", linewidth = 1, linecolor = "grey"),
                        colnames.style = colnames_style(color = "black", face = "bold", size = 12,
                                                        fill = "transparent", linewidth = 1, linecolor = "grey"),
                        rownames.style = colnames_style(color = "black", face = "plain", size = 12,
                                                        fill = "transparent", linewidth = 1, linecolor = "grey")))

table1